{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Exploration & Feature Engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Data Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loading data:\n",
    "The files can be downloaded from: http://datahack.analyticsvidhya.com/contest/practice-problem-bigmart-sales-prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Read files:\n",
    "train = pd.read_csv(\"train.csv\")\n",
    "test = pd.read_csv(\"test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(8523, 13) (5681, 12) (14204, 13)\n"
     ]
    }
   ],
   "source": [
    "#Combine test and train into one file\n",
    "train['source']='train'\n",
    "test['source']='test'\n",
    "data = pd.concat([train, test],ignore_index=True)\n",
    "print train.shape, test.shape, data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Item_Fat_Content                0\n",
       "Item_Identifier                 0\n",
       "Item_MRP                        0\n",
       "Item_Outlet_Sales            5681\n",
       "Item_Type                       0\n",
       "Item_Visibility                 0\n",
       "Item_Weight                  2439\n",
       "Outlet_Establishment_Year       0\n",
       "Outlet_Identifier               0\n",
       "Outlet_Location_Type            0\n",
       "Outlet_Size                  4016\n",
       "Outlet_Type                     0\n",
       "source                          0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Check missing values:\n",
    "data.apply(lambda x: sum(x.isnull()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Item_MRP</th>\n",
       "      <th>Item_Outlet_Sales</th>\n",
       "      <th>Item_Visibility</th>\n",
       "      <th>Item_Weight</th>\n",
       "      <th>Outlet_Establishment_Year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>14204.000000</td>\n",
       "      <td>8523.000000</td>\n",
       "      <td>14204.000000</td>\n",
       "      <td>11765.000000</td>\n",
       "      <td>14204.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>141.004977</td>\n",
       "      <td>2181.288914</td>\n",
       "      <td>0.065953</td>\n",
       "      <td>12.792854</td>\n",
       "      <td>1997.830681</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>62.086938</td>\n",
       "      <td>1706.499616</td>\n",
       "      <td>0.051459</td>\n",
       "      <td>4.652502</td>\n",
       "      <td>8.371664</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>31.290000</td>\n",
       "      <td>33.290000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>4.555000</td>\n",
       "      <td>1985.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>94.012000</td>\n",
       "      <td>834.247400</td>\n",
       "      <td>0.027036</td>\n",
       "      <td>8.710000</td>\n",
       "      <td>1987.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>142.247000</td>\n",
       "      <td>1794.331000</td>\n",
       "      <td>0.054021</td>\n",
       "      <td>12.600000</td>\n",
       "      <td>1999.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>185.855600</td>\n",
       "      <td>3101.296400</td>\n",
       "      <td>0.094037</td>\n",
       "      <td>16.750000</td>\n",
       "      <td>2004.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>266.888400</td>\n",
       "      <td>13086.964800</td>\n",
       "      <td>0.328391</td>\n",
       "      <td>21.350000</td>\n",
       "      <td>2009.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Item_MRP  Item_Outlet_Sales  Item_Visibility   Item_Weight  \\\n",
       "count  14204.000000        8523.000000     14204.000000  11765.000000   \n",
       "mean     141.004977        2181.288914         0.065953     12.792854   \n",
       "std       62.086938        1706.499616         0.051459      4.652502   \n",
       "min       31.290000          33.290000         0.000000      4.555000   \n",
       "25%       94.012000         834.247400         0.027036      8.710000   \n",
       "50%      142.247000        1794.331000         0.054021     12.600000   \n",
       "75%      185.855600        3101.296400         0.094037     16.750000   \n",
       "max      266.888400       13086.964800         0.328391     21.350000   \n",
       "\n",
       "       Outlet_Establishment_Year  \n",
       "count               14204.000000  \n",
       "mean                 1997.830681  \n",
       "std                     8.371664  \n",
       "min                  1985.000000  \n",
       "25%                  1987.000000  \n",
       "50%                  1999.000000  \n",
       "75%                  2004.000000  \n",
       "max                  2009.000000  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Numerical data summary:\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Item_Fat_Content                 5\n",
       "Item_Identifier               1559\n",
       "Item_MRP                      8052\n",
       "Item_Outlet_Sales             3494\n",
       "Item_Type                       16\n",
       "Item_Visibility              13006\n",
       "Item_Weight                    416\n",
       "Outlet_Establishment_Year        9\n",
       "Outlet_Identifier               10\n",
       "Outlet_Location_Type             3\n",
       "Outlet_Size                      4\n",
       "Outlet_Type                      4\n",
       "source                           2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Number of unique values in each:\n",
    "data.apply(lambda x: len(x.unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Frequency of Categories for varible Item_Fat_Content\n",
      "Low Fat    8485\n",
      "Regular    4824\n",
      "LF          522\n",
      "reg         195\n",
      "low fat     178\n",
      "Name: Item_Fat_Content, dtype: int64\n",
      "\n",
      "Frequency of Categories for varible Item_Type\n",
      "Fruits and Vegetables    2013\n",
      "Snack Foods              1989\n",
      "Household                1548\n",
      "Frozen Foods             1426\n",
      "Dairy                    1136\n",
      "Baking Goods             1086\n",
      "Canned                   1084\n",
      "Health and Hygiene        858\n",
      "Meat                      736\n",
      "Soft Drinks               726\n",
      "Breads                    416\n",
      "Hard Drinks               362\n",
      "Others                    280\n",
      "Starchy Foods             269\n",
      "Breakfast                 186\n",
      "Seafood                    89\n",
      "Name: Item_Type, dtype: int64\n",
      "\n",
      "Frequency of Categories for varible Outlet_Location_Type\n",
      "Tier 3    5583\n",
      "Tier 2    4641\n",
      "Tier 1    3980\n",
      "Name: Outlet_Location_Type, dtype: int64\n",
      "\n",
      "Frequency of Categories for varible Outlet_Size\n",
      "Medium    4655\n",
      "Small     3980\n",
      "High      1553\n",
      "Name: Outlet_Size, dtype: int64\n",
      "\n",
      "Frequency of Categories for varible Outlet_Type\n",
      "Supermarket Type1    9294\n",
      "Grocery Store        1805\n",
      "Supermarket Type3    1559\n",
      "Supermarket Type2    1546\n",
      "Name: Outlet_Type, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#Filter categorical variables\n",
    "categorical_columns = [x for x in data.dtypes.index if data.dtypes[x]=='object']\n",
    "#Exclude ID cols and source:\n",
    "categorical_columns = [x for x in categorical_columns if x not in ['Item_Identifier','Outlet_Identifier','source']]\n",
    "#Print frequency of categories\n",
    "for col in categorical_columns:\n",
    "    print '\\nFrequency of Categories for varible %s'%col\n",
    "    print data[col].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Data Cleaning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imputation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Orignal #missing: 2439\n",
      "Final #missing: 0\n"
     ]
    }
   ],
   "source": [
    "#Determine the average weight per item:\n",
    "item_avg_weight = data.pivot_table(values='Item_Weight', index='Item_Identifier')\n",
    "\n",
    "#Get a boolean variable specifying missing Item_Weight values\n",
    "miss_bool = data['Item_Weight'].isnull() \n",
    "\n",
    "#Impute data and check #missing values before and after imputation to confirm\n",
    "print 'Orignal #missing: %d'% sum(miss_bool)\n",
    "data.loc[miss_bool,'Item_Weight'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: item_avg_weight[x])\n",
    "print 'Final #missing: %d'% sum(data['Item_Weight'].isnull())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mode for each Outlet_Type:\n",
      "Outlet_Type\n",
      "Grocery Store         Small\n",
      "Supermarket Type1     Small\n",
      "Supermarket Type2    Medium\n",
      "Supermarket Type3    Medium\n",
      "Name: Outlet_Size, dtype: object\n",
      "\n",
      "Orignal #missing: 4016\n",
      "0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/aarshay/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:200: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n"
     ]
    }
   ],
   "source": [
    "#Import mode function:\n",
    "from scipy.stats import mode\n",
    "\n",
    "#Determing the mode for each\n",
    "outlet_size_mode = data.pivot_table(values='Outlet_Size', columns='Outlet_Type',aggfunc=(lambda x:mode(x).mode[0]) )\n",
    "print 'Mode for each Outlet_Type:'\n",
    "print outlet_size_mode\n",
    "\n",
    "#Get a boolean variable specifying missing Item_Weight values\n",
    "miss_bool = data['Outlet_Size'].isnull() \n",
    "\n",
    "#Impute data and check #missing values before and after imputation to confirm\n",
    "print '\\nOrignal #missing: %d'% sum(miss_bool)\n",
    "data.loc[miss_bool,'Outlet_Size'] = data.loc[miss_bool,'Outlet_Type'].apply(lambda x: outlet_size_mode[x])\n",
    "print sum(data['Outlet_Size'].isnull())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Feature Engineering:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step1: Consider combining categories in Outlet_Type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Outlet_Type\n",
       "Grocery Store         339.828500\n",
       "Supermarket Type1    2316.181148\n",
       "Supermarket Type2    1995.498739\n",
       "Supermarket Type3    3694.038558\n",
       "Name: Item_Outlet_Sales, dtype: float64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Check the mean sales by type:\n",
    "data.pivot_table(values='Item_Outlet_Sales',index='Outlet_Type')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step2: Modify Item_Visibility"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of 0 values initially: 879\n",
      "Number of 0 values after modification: 0\n"
     ]
    }
   ],
   "source": [
    "#Determine average visibility of a product\n",
    "visibility_avg = data.pivot_table(values='Item_Visibility', index='Item_Identifier')\n",
    "\n",
    "#Impute 0 values with mean visibility of that product:\n",
    "miss_bool = (data['Item_Visibility'] == 0)\n",
    "\n",
    "print 'Number of 0 values initially: %d'%sum(miss_bool)\n",
    "data.loc[miss_bool,'Item_Visibility'] = data.loc[miss_bool,'Item_Identifier'].apply(lambda x: visibility_avg[x])\n",
    "print 'Number of 0 values after modification: %d'%sum(data['Item_Visibility'] == 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "count    14204.000000\n",
      "mean         1.061884\n",
      "std          0.235907\n",
      "min          0.844563\n",
      "25%          0.925131\n",
      "50%          0.999070\n",
      "75%          1.042007\n",
      "max          3.010094\n",
      "Name: Item_Visibility_MeanRatio, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "#Determine another variable with means ratio\n",
    "data['Item_Visibility_MeanRatio'] = data.apply(lambda x: x['Item_Visibility']/visibility_avg[x['Item_Identifier']], axis=1)\n",
    "print data['Item_Visibility_MeanRatio'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3: Create a broad category of Type of Item"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Food              10201\n",
       "Non-Consumable     2686\n",
       "Drinks             1317\n",
       "Name: Item_Type_Combined, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Item type combine:\n",
    "data['Item_Identifier'].value_counts()\n",
    "data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x: x[0:2])\n",
    "data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food',\n",
    "                                                             'NC':'Non-Consumable',\n",
    "                                                             'DR':'Drinks'})\n",
    "data['Item_Type_Combined'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4: Determine the years of operation of a store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    14204.000000\n",
       "mean        15.169319\n",
       "std          8.371664\n",
       "min          4.000000\n",
       "25%          9.000000\n",
       "50%         14.000000\n",
       "75%         26.000000\n",
       "max         28.000000\n",
       "Name: Outlet_Years, dtype: float64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Years:\n",
    "data['Outlet_Years'] = 2013 - data['Outlet_Establishment_Year']\n",
    "data['Outlet_Years'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 5: Modify categories of Item_Fat_Content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original Categories:\n",
      "Low Fat    8485\n",
      "Regular    4824\n",
      "LF          522\n",
      "reg         195\n",
      "low fat     178\n",
      "Name: Item_Fat_Content, dtype: int64\n",
      "\n",
      "Modified Categories:\n",
      "Low Fat    9185\n",
      "Regular    5019\n",
      "Name: Item_Fat_Content, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#Change categories of low fat:\n",
    "print 'Original Categories:'\n",
    "print data['Item_Fat_Content'].value_counts()\n",
    "\n",
    "print '\\nModified Categories:'\n",
    "data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({'LF':'Low Fat',\n",
    "                                                             'reg':'Regular',\n",
    "                                                             'low fat':'Low Fat'})\n",
    "print data['Item_Fat_Content'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Low Fat       6499\n",
       "Regular       5019\n",
       "Non-Edible    2686\n",
       "Name: Item_Fat_Content, dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Mark non-consumables as separate category in low_fat:\n",
    "data.loc[data['Item_Type_Combined']==\"Non-Consumable\",'Item_Fat_Content'] = \"Non-Edible\"\n",
    "data['Item_Fat_Content'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 6: Numerical and One-Hot Coding of Categorical variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Import library:\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "le = LabelEncoder()\n",
    "#New variable for outlet\n",
    "data['Outlet'] = le.fit_transform(data['Outlet_Identifier'])\n",
    "var_mod = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Item_Type_Combined','Outlet_Type','Outlet']\n",
    "le = LabelEncoder()\n",
    "for i in var_mod:\n",
    "    data[i] = le.fit_transform(data[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#One Hot Coding:\n",
    "data = pd.get_dummies(data, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type',\n",
    "                              'Item_Type_Combined','Outlet'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Item_Identifier               object\n",
       "Item_MRP                     float64\n",
       "Item_Outlet_Sales            float64\n",
       "Item_Type                     object\n",
       "Item_Visibility              float64\n",
       "Item_Weight                  float64\n",
       "Outlet_Establishment_Year      int64\n",
       "Outlet_Identifier             object\n",
       "source                        object\n",
       "Item_Visibility_MeanRatio    float64\n",
       "Outlet_Years                   int64\n",
       "Item_Fat_Content_0           float64\n",
       "Item_Fat_Content_1           float64\n",
       "Item_Fat_Content_2           float64\n",
       "Outlet_Location_Type_0       float64\n",
       "Outlet_Location_Type_1       float64\n",
       "Outlet_Location_Type_2       float64\n",
       "Outlet_Size_0                float64\n",
       "Outlet_Size_1                float64\n",
       "Outlet_Size_2                float64\n",
       "Outlet_Type_0                float64\n",
       "Outlet_Type_1                float64\n",
       "Outlet_Type_2                float64\n",
       "Outlet_Type_3                float64\n",
       "Item_Type_Combined_0         float64\n",
       "Item_Type_Combined_1         float64\n",
       "Item_Type_Combined_2         float64\n",
       "Outlet_0                     float64\n",
       "Outlet_1                     float64\n",
       "Outlet_2                     float64\n",
       "Outlet_3                     float64\n",
       "Outlet_4                     float64\n",
       "Outlet_5                     float64\n",
       "Outlet_6                     float64\n",
       "Outlet_7                     float64\n",
       "Outlet_8                     float64\n",
       "Outlet_9                     float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Item_Fat_Content_0</th>\n",
       "      <th>Item_Fat_Content_1</th>\n",
       "      <th>Item_Fat_Content_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Item_Fat_Content_0  Item_Fat_Content_1  Item_Fat_Content_2\n",
       "0                   1                   0                   0\n",
       "1                   0                   0                   1\n",
       "2                   1                   0                   0\n",
       "3                   0                   0                   1\n",
       "4                   0                   1                   0\n",
       "5                   0                   0                   1\n",
       "6                   0                   0                   1\n",
       "7                   1                   0                   0\n",
       "8                   0                   0                   1\n",
       "9                   0                   0                   1"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[['Item_Fat_Content_0','Item_Fat_Content_1','Item_Fat_Content_2']].head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step7: Exporting Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/aarshay/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "/Users/aarshay/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
     ]
    }
   ],
   "source": [
    "#Drop the columns which have been converted to different types:\n",
    "data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)\n",
    "\n",
    "#Divide into test and train:\n",
    "train = data.loc[data['source']==\"train\"]\n",
    "test = data.loc[data['source']==\"test\"]\n",
    "\n",
    "#Drop unnecessary columns:\n",
    "test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)\n",
    "train.drop(['source'],axis=1,inplace=True)\n",
    "\n",
    "#Export files as modified versions:\n",
    "train.to_csv(\"train_modified.csv\",index=False)\n",
    "test.to_csv(\"test_modified.csv\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
